{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h1> Scrape US 115th congress </h1>\n",
    "This notebooks guides you through the process of scraping the 115th US Congress' members' tweets\n",
    "until September 26, 2018.\n",
    "\n",
    "<h2> Table of Contents </h2>\n",
    "\n",
    "<ol>\n",
    "<li> Preamble\n",
    "<li> Download Twitter Scraper\n",
    "<li> Scrape Tweets\n",
    "</ol>\n",
    "\n",
    "<b> 1. Preamble </b>\n",
    "\n",
    "Scrape US Senators' twitter accounts from [this page](https://www.socialseer.com/resources/us-senator-twitter-accounts/) (accessed September 27, 2018).\n",
    "\n",
    "<i> NOTE SenatorSanders is an official yet unused Twitter account by Senator Bernie Sanders, we substitute it with SenSanders, which is the account in use! NOTE that the page also offers SenBennetCO, which we substituted with SenatorBennet.</i>\n",
    "\n",
    "<b> 2. Download Twitter Scraper </b>\n",
    "\n",
    "We use the Twitter scraper by [taspinar](https://github.com/taspinar/twitterscraper).\n",
    "\n",
    "<i> NOTE that this library uses Python3! <i>\n",
    "\n",
    "<b> 3. Scrape Tweets </b>\n",
    "\n",
    "We will download all tweets by the list of account names of the senators, and afterwards remove all tweets that have been posted after a designated end date."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "import subprocess\n",
    "import json, re, os\n",
    "import datetime\n",
    "\n",
    "\n",
    "def load_account_names(filepath):\n",
    "    \"\"\"\n",
    "    load usernames from a file in the format: \n",
    "        senator_username, Party\n",
    "    and outputs a list of Twitter account names\n",
    "    requires:\n",
    "    [filepath] : list of account names, and their respective label (party name) \n",
    "    \"\"\"\n",
    "    result = []\n",
    "    with open(filepath, mode = 'r') as f:\n",
    "        for line in f:\n",
    "            line = line.replace('\\n', '')\n",
    "            line = line.split(',')[0]\n",
    "            result.append(line)\n",
    "    return result\n",
    "\n",
    "\n",
    "def scrape_tweets(username, outpath):\n",
    "    \"\"\"scrape tweets for a given Twitter username,\n",
    "    if the error code '0' is reported: this means success \n",
    "    it is not possible to set beginning and ending of this when you specify a user to scrape\n",
    "    requires:\n",
    "    [username] : Twitter username (e.g. realDonaldTrump)\n",
    "    [outpath]  : name of the output file (.json)\n",
    "    \"\"\"\n",
    "    # run twitterscraper on a given username\n",
    "    subprocess.call(['twitterscraper', username, '-u', '-o', outpath])\n",
    "    return\n",
    "    \n",
    "    \n",
    "def write_history(string, outpath):\n",
    "    \"\"\"\n",
    "    write a string to a given filepath\n",
    "    requires:\n",
    "    [string] : string to write into the file\n",
    "    [outpath] : output file path\n",
    "    \"\"\"\n",
    "    with open(outpath, mode='w', encoding = 'utf-8') as f:\n",
    "        f.write(string)\n",
    "        \n",
    "\n",
    "def extract_tweet_text(usernames, tweet_folder, min_day=1, min_month = 1, min_year = 2000, max_day = 31, max_month=12, max_year=3000):\n",
    "    \"\"\"\n",
    "    extract the plain text of a list of usernames' scraped json twitter histories\n",
    "    between [min_year, min_month, min_day] and [max_year, max_month, max_day]\n",
    "    requires:\n",
    "    [usernames]:\n",
    "    [tweet_folder]\n",
    "    \n",
    "    \"\"\"\n",
    "    min_date = datetime.date(min_year, min_month, min_day)\n",
    "    max_date = datetime.date(max_year, max_month, max_day)\n",
    "    \n",
    "    # collect all tweets per username\n",
    "    user_tweets = []\n",
    "\n",
    "    # use the json.loads method to load a json from a string\n",
    "    for username in usernames:\n",
    "        filepath = os.path.join(tweet_folder,username + '_twitter.json')\n",
    "        # read a .json file scraped from twitter; python treats json as a string\n",
    "        with open(filepath, mode = 'r') as f:\n",
    "            twitter_json = json.load(f)\n",
    "\n",
    "        # concatenate all twitter messages\n",
    "        all_tweets = \"\"\n",
    "\n",
    "        # for all tweets\n",
    "        for x in twitter_json:\n",
    "            year, month, day = [int(el) for el in x['timestamp'].split('T')[0].split('-')]\n",
    "            date_item = datetime.date(year, month, day)\n",
    "            # if the tweet date is between min_date and max_date..\n",
    "            if (min_date <= date_item) and (date_item <= max_date):\n",
    "                # take the tweet text\n",
    "                tweet = x['text']\n",
    "                # replace new-lines (\\n) and carriage returns (\\r)\n",
    "                tweet = tweet.replace('\\n',' ').replace('\\r','') + \" \"\n",
    "                # delete web-links\n",
    "                tweet = re.sub(r\"http\\S+\" , \"\" , tweet)\n",
    "                # perhaps it makes sense to remove pic.* tokens also!\n",
    "                ## place_holder\n",
    "                # add the tweets\n",
    "                all_tweets += tweet\n",
    "        user_tweets.append(all_tweets)\n",
    "    \n",
    "    # write the raw text into .txt files [username.txt]\n",
    "    for username, history in zip(usernames,user_tweets):\n",
    "        filepath = os.path.join(tweet_folder, username+'.txt')\n",
    "        write_history(history, filepath)\n",
    "\n",
    "        \n",
    "\n",
    "def load_label_file(label_file):\n",
    "    \"\"\"\n",
    "    load a labelfile in the format: \n",
    "        name, label [e.g. \"JohnBoozman, Republican\"]\n",
    "    and output is as a list of duples (name,label)\n",
    "    requires:\n",
    "        [label_file] : path to the label file\n",
    "    \"\"\"\n",
    "    mapping = [] \n",
    "    \n",
    "    with open(label_file, mode = \"r\") as f:\n",
    "        for line in f:\n",
    "            # extract duple: [name, label]\n",
    "            duple = line.replace(\" \",\"\").replace(\"\\n\",\"\").replace(\"\\r\",\"\").split(',')\n",
    "            mapping.append(duple)\n",
    "            \n",
    "    return mapping\n",
    "\n",
    "\n",
    "def sort_files(label_file, file_dir):\n",
    "    \"\"\"\n",
    "    sort files that are located in file_dir accoring to the label_file\n",
    "    requires:\n",
    "    [label_file]: label_files that contains the true labels of the underlying files\n",
    "    [file_dir]  : path to the directory containing files to be sorted into other folders\n",
    "    \"\"\"\n",
    "    # load label mapping\n",
    "    mapping = load_label_file(label_file)\n",
    "\n",
    "    \n",
    "    for map in mapping:\n",
    "        file_name = map[0]\n",
    "        label     = map[1] \n",
    "        \n",
    "        file_name_temp1 = os.path.join(file_dir,file_name + \".txt\")\n",
    "        file_name_temp2 = os.path.join(file_dir,file_name + \"_twitter.json\")\n",
    "        \n",
    "        # if \"Republican\"\n",
    "        if label == \"Republican\":\n",
    "            subprocess.call([\"mv\", file_name_temp1, os.path.join(file_dir,\"Republican\") ])\n",
    "            subprocess.call([\"mv\", file_name_temp2, os.path.join(file_dir,\"Republican\") ])\n",
    "        # elif \"Democrat\"\n",
    "        elif label == \"Democrat\":\n",
    "            subprocess.call([\"mv\", file_name_temp1, os.path.join(file_dir,\"Democrat\") ])\n",
    "            subprocess.call([\"mv\", file_name_temp2, os.path.join(file_dir,\"Democrat\") ])\n",
    "        # elif \"Independent\"\n",
    "        elif label == \"Independent\":\n",
    "            subprocess.call([\"mv\", file_name_temp1, os.path.join(file_dir,\"Independent\") ])\n",
    "            subprocess.call([\"mv\", file_name_temp2, os.path.join(file_dir,\"Independent\") ])\n",
    "        else:\n",
    "            print(\"Label unknown\")\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set path to the file holding labels and account names\n",
    "label_file = \"US115thcongress/senator_twitter_account_names_115th_congress_20180926\"\n",
    "# load accounts\n",
    "usernames = load_account_names(label_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set output directory for the tweet json files\n",
    "outpath = \"tweets\"\n",
    "\n",
    "# scrape all accounts\n",
    "for username in usernames:\n",
    "    print(\"Scrape {}\".format(username))\n",
    "    scrape_tweets(username, os.path.join(outpath, username + '_twitter.json'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "# extract all plain texts from the tweets before (max_year, max_month, max_day)\n",
    "extract_tweet_text(usernames, outpath, max_day = 26, max_month = 9, max_year = 2018)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sort all files in the tweets folder\n",
    "label_dir  = \"US115thcongress\"\n",
    "label_file = \"US115thcongress/senator_twitter_account_names_115th_congress_20180926\"\n",
    "\n",
    "# make label folders\n",
    "subprocess.call([\"mkdir\", os.path.join(label_dir,\"Independent\")])\n",
    "subprocess.call([\"mkdir\", os.path.join(label_dir,\"Democrat\")])\n",
    "subprocess.call([\"mkdir\", os.path.join(label_dir,\"Republican\")])\n",
    "\n",
    "# sort files\n",
    "sort_files(label_file, label_dir)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
